{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from bs4 import BeautifulSoup\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer\n",
    "import lightgbm as lgb\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "stemmer = WordNetLemmatizer()\n",
    "from scipy import sparse\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import StrMethodFormatter\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from math import sqrt\n",
    "from gensim.models import Word2Vec\n",
    "import nltk\n",
    "tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_to_words(text,remove_stopwords=False):\n",
    "    text=BeautifulSoup(text).get_text()\n",
    "    text=re.sub(\"[^a-zA-Z]\",\" \",text)\n",
    "    words=text.lower().split()\n",
    "    if(remove_stopwords==True):\n",
    "        stop_words=stopwords.words('english')\n",
    "        words=[x for x in words if x not in stop_words]\n",
    "    return words\n",
    "\n",
    "def preprocess_to_sentences(text,tokenizer):\n",
    "    raw_sentences=tokenizer.tokenize(text.strip())\n",
    "    sentences=[]\n",
    "    for i in raw_sentences:\n",
    "        if(len(i)!=0):\n",
    "            sentences.append(preprocess_to_words(i,False))\n",
    "    return sentences\n",
    "\n",
    "def makeFeatureVec(words,model,num_features):\n",
    "    featureVec=np.zeros((num_features,),dtype=\"float32\")\n",
    "    nwords=0\n",
    "    index2word_set=set(model.wv.index2word)\n",
    "    for word in words:\n",
    "        if word in index2word_set: \n",
    "            nwords=nwords+1\n",
    "            featureVec=np.add(featureVec,model[word])\n",
    "    featureVec=np.divide(featureVec,nwords)\n",
    "    return featureVec\n",
    "\n",
    "def getAvgFeatureVecs(reviews,model,num_features):\n",
    "    counter=0\n",
    "    reviewFeatureVecs=np.zeros((len(reviews),num_features),dtype=\"float32\")\n",
    "    for review in reviews:\n",
    "        reviewFeatureVecs[counter]=makeFeatureVec(review,model,num_features)\n",
    "        counter=counter+1\n",
    "    return reviewFeatureVecs\n",
    "\n",
    "def build_word2vec_model(train,test,num_of_features,model_type):\n",
    "    print(\"Extracting sentences from training data\")\n",
    "    total_sentences=[]\n",
    "    cnt=0\n",
    "    for i in train[model_type]:\n",
    "        total_sentences+=preprocess_to_sentences(i,tokenizer)\n",
    "        if(cnt%200==0):\n",
    "            print(str(cnt)+\" rows done!\")\n",
    "        cnt+=1\n",
    "\n",
    "    print(\"\\n\\nExtracting sentences from test data\")\n",
    "    cnt=0\n",
    "    for i in test[model_type]:\n",
    "        total_sentences+=preprocess_to_sentences(i,tokenizer)\n",
    "        if(cnt%200==0):\n",
    "            print(str(cnt)+\" rows done\")\n",
    "        cnt+=1\n",
    "    word2vec_model = Word2Vec(total_sentences, workers=4, size=num_of_features, min_count = 3, window = 10, sample = 1e-3)\n",
    "    return word2vec_model\n",
    "\n",
    "def extract_word2vec_features(train,test,word2vec_model,num_of_features,model_type):\n",
    "    \n",
    "    train_words=train[model_type].apply(preprocess_to_words,args=(True,))\n",
    "    avg_feature_vectors_train=getAvgFeatureVecs(train_words,word2vec_model,num_of_features)\n",
    "    \n",
    "    test_words=test[model_type].apply(preprocess_to_words,args=(True,))\n",
    "    avg_feature_vectors_test=getAvgFeatureVecs(test_words,word2vec_model,num_of_features)\n",
    "    \n",
    "    train_word2vec_df=pd.DataFrame(avg_feature_vectors_train,columns=[model_type+str(x) for x in range(1,num_of_features+1)])\n",
    "    test_word2vec_df=pd.DataFrame(avg_feature_vectors_test,columns=[model_type+str(x) for x in range(1,num_of_features+1)])\n",
    "    \n",
    "    return train_word2vec_df,test_word2vec_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Train02.csv', encoding='ISO-8859-1')\n",
    "test_orig = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\Data_Test02.csv', encoding='ISO-8859-1')\n",
    "FeatureNames = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\Participants_Data\\\\FeatureNames02.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "test_other_models = pd.read_csv('C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20190930_XGB01_DS.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]\n",
    "test_other_models = test_other_models[['id','Price_Log_Pred']]\n",
    "\n",
    "train = pd.merge(train_orig, train_other_models, on='id')\n",
    "test = pd.merge(test_orig, test_other_models, on='id')\n",
    "\n",
    "train['Price_Log'] = np.log10(train['Price']+1)\n",
    "train.hist(column='Price_Log')\n",
    "\n",
    "FeatureNames = FeatureNames['x'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "word2vec_model = build_word2vec_model(train, test, 300, 'Synopsis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "trn_mean_emb, tst_mean_emb = extract_word2vec_features(train, test, word2vec_model, 300, 'Synopsis')\n",
    "print(\"Train Shape : \",trn_mean_emb.shape)\n",
    "print(\"Test Shape : \",tst_mean_emb.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FeatureNames2 = list(trn_mean_emb.columns)\n",
    "FeatureNames2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.concat([train,trn_mean_emb], axis=1)\n",
    "test = pd.concat([test,tst_mean_emb], axis=1)\n",
    "print(\"Train Shape : \",train.shape)\n",
    "print(\"Test Shape : \",test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(train.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_list = list(train.FOLD_NUM.unique())\n",
    "fold_list.sort()\n",
    "fold_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "param = {\n",
    "    'learning_rate': 0.05,\n",
    "    'max_depth': 14,\n",
    "    'min_data_in_leaf': 3,\n",
    "    'bagging_freq': 0,\n",
    "    'bagging_fraction': 0.95,\n",
    "    'feature_fraction': 0.2,\n",
    "    'boost': 'gbdt',    \n",
    "    'objective': 'regression',\n",
    "    'metric': 'rmse',\n",
    "    'seed': 392\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    trn_data = lgb.Dataset(temp_train_tf, label=temp_train['Price_Log'])\n",
    "    val_data = lgb.Dataset(temp_val_tf, label=temp_val['Price_Log'])\n",
    "    \n",
    "    model = lgb.train(param, trn_data, 2000000,valid_sets = val_data, verbose_eval=50, early_stopping_rounds = 300)\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LGB'] = model.predict(temp_val_tf, num_iteration=model.best_iteration)\n",
    "    \n",
    "    print(\"Fold RMSLE = \",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = model.predict(test[FeatureNames+FeatureNames2].values, num_iteration=model.best_iteration)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + model.predict(test[FeatureNames+FeatureNames2].values, num_iteration=model.best_iteration)\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB'])))\n",
    "# LB SCORE : 0.7791"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = train[['id','FOLD_NUM','Price','Price_Log']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LGB'] = sub_data_preds\n",
    "test['Price_Log_Pred_LGB'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.models import Sequential, load_model\n",
    "from keras.layers import Dense, Dropout\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def nn_model(input_shape):\n",
    "    model = Sequential()\n",
    "    model.add(Dense(200, input_dim=input_shape, activation='relu'))\n",
    "    model.add(Dropout(0.2))\n",
    "    model.add(BatchNormalization())\n",
    "    model.add(Dense(200, activation='relu'))\n",
    "    model.add(Dropout(0.2))\n",
    "    model.add(BatchNormalization())\n",
    "    model.add(Dense(100, activation='relu'))\n",
    "    model.add(Dropout(0.1))\n",
    "    model.add(BatchNormalization())\n",
    "    model.add(Dense(1))\n",
    "    return(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chk1 = pd.isnull(train[FeatureNames+FeatureNames2]).sum()\n",
    "chk1[chk1 > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.fillna(0,inplace=True)\n",
    "test.fillna(0,inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chk2 = pd.isnull(train[FeatureNames+FeatureNames2]).sum()\n",
    "chk2[chk2 > 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Ridge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    ridgereg = Ridge(alpha=0.175,normalize=True)\n",
    "    ridgereg.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_RIDGE'] = ridgereg.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RIDGE'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = ridgereg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + ridgereg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "\n",
    "    IterationNum = IterationNum + 1\n",
    "\n",
    "print(\"RIDGE 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RIDGE'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RIDGE']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_RIDGE'] = sub_data_preds\n",
    "test['Price_Log_Pred_RIDGE'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import Lasso"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    lassoreg = Lasso(alpha=0.00002,normalize=True, max_iter=1e6)\n",
    "    lassoreg.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LASSO'] = lassoreg.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LASSO'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = lassoreg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + lassoreg.predict(test[FeatureNames+FeatureNames2].values)\n",
    "\n",
    "    IterationNum = IterationNum + 1\n",
    "    \n",
    "print(\"LASSO 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LASSO'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LASSO']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LASSO'] = sub_data_preds\n",
    "test['Price_Log_Pred_LASSO'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    KNN = KNeighborsRegressor(n_neighbors = 20)\n",
    "    KNN.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_KNN'] = KNN.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_KNN'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = KNN.predict(test[FeatureNames+FeatureNames2].values)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + KNN.predict(test[FeatureNames+FeatureNames2].values)\n",
    "\n",
    "    IterationNum = IterationNum + 1\n",
    "\n",
    "print(\"KNN 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_KNN'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_KNN']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_KNN'] = sub_data_preds\n",
    "test['Price_Log_Pred_KNN'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    RF = RandomForestRegressor(n_estimators = 100, min_samples_leaf = 3, max_features = 60, random_state = 412,\n",
    "                               verbose = 0, max_depth = 40)\n",
    "    RF.fit(temp_train_tf,temp_train['Price_Log'])\n",
    "    \n",
    "    temp_val['Price_Log_Pred_RF'] = RF.predict(temp_val_tf)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_RF'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = RF.predict(test[FeatureNames+FeatureNames2])\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + RF.predict(test[FeatureNames+FeatureNames2])\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"RF 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_RF'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_RF']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_RF'] = sub_data_preds\n",
    "test['Price_Log_Pred_RF'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    temp_train_tf = temp_train[FeatureNames+FeatureNames2].values\n",
    "    temp_val_tf = temp_val[FeatureNames+FeatureNames2].values\n",
    "    \n",
    "    dtrain = xgb.DMatrix(data = temp_train_tf, label = temp_train['Price_Log'])\n",
    "    dtest = xgb.DMatrix(data = temp_val_tf, label = temp_val['Price_Log'])\n",
    "                \n",
    "    watchlist = [(dtrain, 'train'), (dtest, 'eval')]\n",
    "    \n",
    "    params1 = { 'seed': 501,\n",
    "                'colsample_bytree': 0.2,\n",
    "                'verbosity': 1,\n",
    "                'subsample': 0.95,\n",
    "                'learning_rate': 0.05,\n",
    "                'objective': 'reg:squarederror',\n",
    "                'max_depth': 10,\n",
    "                'min_child_weight': 3,\n",
    "                'booster': 'gbtree',\n",
    "                'eval_metric': 'rmse' }\n",
    "    \n",
    "    XGB = xgb.train(params = params1,\n",
    "                    dtrain = dtrain,\n",
    "                    num_boost_round = 10000,\n",
    "                    evals = watchlist,\n",
    "                    verbose_eval = 20,\n",
    "                    early_stopping_rounds = 100)\n",
    "    \n",
    "    temp_val['Price_Log_Pred_XGB'] = XGB.predict(xgb.DMatrix(data = temp_val_tf), ntree_limit = XGB.best_ntree_limit)\n",
    "    \n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_XGB'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = XGB.predict(xgb.DMatrix(data = test[FeatureNames+FeatureNames2].values), ntree_limit = XGB.best_ntree_limit)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + XGB.predict(xgb.DMatrix(data = test[FeatureNames+FeatureNames2].values), ntree_limit = XGB.best_ntree_limit)\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"XGB 01 CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_XGB'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_XGB']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_XGB'] = sub_data_preds\n",
    "test['Price_Log_Pred_XGB'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20191007_Stack05_DS.csv\", index=False)\n",
    "test.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20191007_Stack05_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names_ensemble = list(training_cv_predictions.columns[4:])\n",
    "feature_names_ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corr = training_cv_predictions[feature_names_ensemble + ['Price_Log']].corr()\n",
    "ax = sns.heatmap(\n",
    "    corr, \n",
    "    vmin=0, vmax=1, center=0.5,\n",
    "    cmap=sns.diverging_palette(20, 220, n=200),\n",
    "    square=True\n",
    ")\n",
    "ax.set_xticklabels(\n",
    "    ax.get_xticklabels(),\n",
    "    rotation=45,\n",
    "    horizontalalignment='right'\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_ensemble = {\n",
    "    'learning_rate': 0.001,\n",
    "    'max_depth': 3,\n",
    "    'min_data_in_leaf': 3,\n",
    "    'bagging_freq': 5,\n",
    "    'bagging_fraction': 0.3,\n",
    "    'feature_fraction': 0.5,\n",
    "    'boost': 'gbdt',    \n",
    "    'objective': 'regression',\n",
    "    'metric': 'rmse',\n",
    "    'seed': 1234\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IterationNum = 1\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running CV Iteration Num :\", IterationNum)\n",
    "    \n",
    "    temp_train = training_cv_predictions[training_cv_predictions['FOLD_NUM'] != fold_num]\n",
    "    temp_val = training_cv_predictions[training_cv_predictions['FOLD_NUM'] == fold_num]\n",
    "    \n",
    "    trn_data = lgb.Dataset(temp_train[feature_names_ensemble], label=temp_train['Price_Log'])\n",
    "    val_data = lgb.Dataset(temp_val[feature_names_ensemble], label=temp_val['Price_Log'])\n",
    "    \n",
    "    model_e = lgb.train(param_ensemble,\n",
    "                        trn_data,\n",
    "                        20000000,\n",
    "                        valid_sets = val_data,\n",
    "                        verbose_eval=50,\n",
    "                        early_stopping_rounds = 300)\n",
    "    \n",
    "    temp_val['Price_Log_Pred_LGB_ENS'] = model_e.predict(temp_val[feature_names_ensemble],\n",
    "                                                         num_iteration=model_e.best_iteration)\n",
    "    \n",
    "    print(\"Fold RMSLE = \",sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_LGB_ENS'])))\n",
    "    \n",
    "    if(IterationNum == 1):\n",
    "        CV_SCORED_DATA = temp_val.copy(deep=True)\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = model_e.predict(test[feature_names_ensemble], num_iteration=model_e.best_iteration)\n",
    "    else:\n",
    "        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,temp_val])\n",
    "        CV_SCORED_DATA.reset_index(drop = True, inplace = True)\n",
    "        sub_data_preds = sub_data_preds + model_e.predict(test[feature_names_ensemble],\n",
    "                                                          num_iteration=model_e.best_iteration)\n",
    "\n",
    "    IterationNum = IterationNum + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"LGB ENS CV RMSLE = \",sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))\n",
    "print(\"CV 1-RMSLE = \",1-sqrt(mean_squared_error(CV_SCORED_DATA['Price_Log'], CV_SCORED_DATA['Price_Log_Pred_LGB_ENS'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions = pd.merge(training_cv_predictions, CV_SCORED_DATA[['id','Price_Log_Pred_LGB_ENS']], on='id')\n",
    "training_cv_predictions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sub_data_preds = sub_data_preds / len(fold_list)\n",
    "test['Price_Log_Pred_LGB_ENS'] = sub_data_preds\n",
    "test['Price_Log_Pred_LGB_ENS'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_cv_predictions.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Trn Datasets\\\\20191007_Stack05_DS.csv\", index=False)\n",
    "test.to_csv(\"C:\\\\Kaggle\\\\BooksPrice\\\\CV Scrd Tst Datasets\\\\20191007_Stack05_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "submission = pd.read_excel('C:/Kaggle/BooksPrice/Participants_Data/Sample_Submission.xlsx', encoding='ISO-8859-1')\n",
    "\n",
    "sub_data_preds2 = (10**test['Price_Log_Pred_LGB_ENS'].values) - 1\n",
    "pd.DataFrame(sub_data_preds2).describe()\n",
    "\n",
    "submission['Price'] = sub_data_preds2\n",
    "submission.to_excel('C:\\\\Kaggle\\\\BooksPrice\\\\Submissions\\\\20191007_Stack05_DS.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(sub_data_preds2).describe()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
